Slip 4

Q.1. Write a Python program to implement the k-means algorithm on the Mall Customers dataset.

# kmeans_mall_customers.py

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# ------------------------------
# Step 1: Build a small in-memory Mall Customers table
# ------------------------------
# (Swap this section for pd.read_csv("Mall_Customers.csv") when the real file is available)
column_names = ['CustomerID', 'Gender', 'Age', 'Annual_Income_k$', 'Spending_Score']

# One tuple per customer, in the same order as column_names.
customer_rows = [
    (1, 'Male', 19, 15, 39),
    (2, 'Male', 21, 16, 81),
    (3, 'Female', 20, 17, 6),
    (4, 'Female', 23, 18, 77),
    (5, 'Female', 31, 40, 40),
    (6, 'Male', 40, 60, 80),
    (7, 'Female', 35, 75, 20),
    (8, 'Male', 23, 30, 50),
    (9, 'Female', 64, 90, 10),
    (10, 'Male', 30, 50, 85),
]

# Transpose the row records into a column-name -> list-of-values mapping.
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*customer_rows))
}

df = pd.DataFrame(data)
print("Mall Customers Dataset:")
print(df, "\n")

# ------------------------------
# Step 2: Pick the columns the clustering will use
# ------------------------------
# Only annual income and spending score are fed to K-Means.
feature_columns = ['Annual_Income_k$', 'Spending_Score']
X = df.loc[:, feature_columns]

# ------------------------------
# Step 3: Find Optimal Number of Clusters (Elbow Method)
# ------------------------------
# K-Means cannot fit more clusters than there are samples, so the sweep is
# capped at len(X). This keeps the script working when a smaller dataset is
# substituted in Step 1 (the original fixed range 1..10 would raise ValueError).
max_k = min(10, len(X))
k_values = range(1, max_k + 1)

wcss = []  # Within-cluster sum of squares (inertia) for each k
for k in k_values:
    # n_init is passed explicitly: its default differs between scikit-learn
    # releases, and leaving it implicit makes the curve version-dependent.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot the Elbow Method: the "elbow" is where adding clusters stops
# reducing WCSS substantially.
plt.figure(figsize=(8, 5))
plt.plot(k_values, wcss, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.show()

# ------------------------------
# Step 4: Apply K-Means Clustering
# ------------------------------
# Let's assume from the Elbow Method we choose k=4.
# n_init is passed explicitly: its default differs between scikit-learn
# releases, so pinning it keeps the resulting clusters the same regardless
# of the installed version.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)

# fit_predict fits the model and returns one cluster index per row of X.
y_kmeans = kmeans.fit_predict(X)

# Add cluster labels to the dataset
df['Cluster'] = y_kmeans

print("Clustered Data:")
print(df, "\n")

# ------------------------------
# Step 5: Visualize the Clusters
# ------------------------------
plt.figure(figsize=(8, 6))

# Take the cluster count from the fitted model instead of hard-coding 4, so
# changing k in Step 4 cannot cause an IndexError or silently drop clusters.
n_clusters = kmeans.n_clusters
base_colors = ['red', 'blue', 'green', 'purple']
if n_clusters <= len(base_colors):
    colors = base_colors[:n_clusters]
else:
    # More clusters than named colours: fall back to a qualitative colormap.
    palette = plt.get_cmap('tab20')
    colors = [palette(idx % palette.N) for idx in range(n_clusters)]

points = X.values  # column 0 = annual income, column 1 = spending score
for i in range(n_clusters):
    members = y_kmeans == i  # boolean mask of the rows assigned to cluster i
    plt.scatter(
        points[members, 0],
        points[members, 1],
        s=100,
        # `color=` (not `c=`) so an RGBA tuple from the colormap is read as a
        # single colour rather than as per-point values.
        color=colors[i],
        label=f'Cluster {i+1}'
    )

# Plot centroids
plt.scatter(
    kmeans.cluster_centers_[:, 0],
    kmeans.cluster_centers_[:, 1],
    s=200,
    c='yellow',
    marker='X',
    label='Centroids'
)

plt.title('K-Means Clustering of Mall Customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1–100)')
plt.legend()
plt.show()

Q.2. Write a Python program to implement simple linear regression for predicting house
prices.

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Step 1: Read the house-price data from disk
# (Point csv_path at your own file if it is not 'house_prices.csv')
csv_path = "house_prices.csv"
df = pd.read_csv(csv_path)

print("✅ Dataset loaded successfully!\n")
preview = df.head()
print("First 5 rows:\n", preview, "\n")

# Step 2: Separate the predictor from the value being predicted
# The column names below assume the CSV has 'Area' (sq ft) and 'Price';
# adjust them to match your dataset.
feature_cols = ['Area']
target_col = 'Price'
X = df[feature_cols]   # independent variable (kept 2-D for scikit-learn)
y = df[target_col]     # dependent variable

# Step 3: Hold out 20% of the rows for testing; the rest is used for training.
# random_state is fixed so the split is reproducible.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Step 4: Fit an ordinary least-squares line on the training portion
# (fit() returns the estimator itself, so the call can be chained).
model = LinearRegression().fit(X_train, y_train)

# Step 5: Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Step 6: Score the predictions against the held-out targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("📊 Model Evaluation:")
# Each entry: (label, value, number of decimal places to show).
report = (
    ("Mean Squared Error:", mse, 2),
    ("R² Score:", r2, 4),
    ("Intercept (b0):", model.intercept_, 2),
    ("Slope (b1):", model.coef_[0], 2),
)
for label, value, digits in report:
    print(label, round(value, digits))

# Step 7: Draw the held-out points and the fitted line on the current axes
ax = plt.gca()
ax.scatter(X_test, y_test, label='Actual Prices', color='blue')
ax.plot(X_test, y_pred, label='Regression Line', color='red', linewidth=2)
ax.set_title("Simple Linear Regression - House Price Prediction")
ax.set_xlabel("Area (sq ft)")
ax.set_ylabel("Price")
ax.legend()
plt.show()

# Step 8: Example prediction for new input
area_value = [[2500]]  # Example: 2500 sq ft

# The model was fitted on a DataFrame, so the new sample is wrapped in a
# DataFrame carrying the same column labels as X. Passing the bare nested
# list makes scikit-learn warn that the input lacks the feature names seen
# during fit.
new_sample = pd.DataFrame(area_value, columns=X.columns)
predicted_price = model.predict(new_sample)
print(f"\n🏠 Predicted price for {area_value[0][0]} sq ft = ${predicted_price[0]:.2f}")